In [1]:
%matplotlib inline
import os
import pathlib
import arrow
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dota.helpers as h
from dota import api
import dota.sql.orm as o
data_dir = pathlib.Path(os.path.expanduser('~/sandbox/dota/data/pro'))
games = h.cached_games(data_dir)  # BUG FIX: was bare `cached_games` -- NameError on a fresh kernel

_680 = arrow.get(1381989600)  # cutoff timestamp (patch boundary? -- TODO confirm); drops older drafts

pbs = (api.DetailsResponse.from_json(str(data_dir) + '/' + str(x) + '.json') for x in games)
pbs = (dr for dr in pbs if getattr(dr, 'picks_bans') is not None and dr.start_time > _680)
df = pd.concat(x.picks_bans for x in pbs)
df['hero_id'] = df.hero_id.replace(api._hero_id_to_names)
df.head()

# NOTE(review): `either`, `picks`, `bans` were defined in a since-deleted
# cell (hidden state); reconstructed here so the notebook survives
# Restart & Run All. Presumably per-hero counts of picks, bans, and
# both combined -- TODO confirm `is_pick` column name against the API frame.
picks = df.loc[df.is_pick, 'hero_id'].value_counts()
bans = df.loc[~df.is_pick, 'hero_id'].value_counts()
either = df['hero_id'].value_counts()

fig, axes = plt.subplots(figsize=(20, 40), ncols=3)
either.plot(kind='barh', ax=axes[0], label='Either')
picks.plot(kind='barh', ax=axes[1], label='Picks')
bans.plot(kind='barh', ax=axes[2], label='Bans')
In [2]:
def load_frame(data_dir):
    """
    Build one picks/bans DataFrame from all cached captains-mode games.

    Parameters
    ----------
    data_dir : str or pathlib.Path
        Directory containing cached match-details JSON files.

    Returns
    -------
    DataFrame
        One row per pick/ban, annotated with team name/id and match_id,
        plus factorized label columns ``team_id_f`` and ``hero_id_f``.
    """
    if isinstance(data_dir, str):
        data_dir = pathlib.Path(os.path.expanduser(data_dir))
    games = h.cached_games(data_dir)
    cutoff = arrow.get(1381989600)  # TODO: take as arg
    responses = (api.DetailsResponse.from_json(str(g)) for g in games)
    # keep only captains-mode (game_mode == 2) games after the cutoff
    # that actually carry draft data
    drafts = (r for r in responses
              if getattr(r, 'picks_bans') is not None and r.start_time > cutoff
              and r.game_mode == 2)
    frames = []
    for draft in drafts:
        pb = draft.picks_bans.copy()
        radiant = pb.team == 0
        dire = pb.team == 1
        pb.loc[radiant, 'team_name'] = getattr(draft, 'radiant_name', np.nan)
        pb.loc[radiant, 'team_id'] = getattr(draft, 'radiant_team_id', np.nan)
        pb.loc[dire, 'team_name'] = getattr(draft, 'dire_name', np.nan)
        pb.loc[dire, 'team_id'] = getattr(draft, 'dire_team_id', np.nan)
        pb['match_id'] = draft.match_id
        frames.append(pb)
    out = pd.concat(frames, ignore_index=True)
    out = out.replace({None: np.nan})
    out = h.pb_only_complete_drafts(out)
    # factorize so downstream classifiers get dense integer labels
    out['team_id_f'], _team_labels = pd.factorize(out.team_id)
    out['hero_id_f'], _hero_labels = pd.factorize(out.hero_id)
    return out
In [3]:
# NOTE(review): hardcoded local path -- hoist into a config cell for portability
df = load_frame('~/sandbox/dota/data/pro/')
In [4]:
df.head(10)
Out[4]:
We can't just plug in hero_id as a feature since the integers imply an ordering.
In reality hero_id=0 is no "closer" to hero_id=1 than to hero_id=99.
We'll try out feature hashing or a dict vectorizer.
In [6]:
from sklearn.feature_extraction import DictVectorizer
api._hero_id_to_names
Out[6]:
In [396]:
# predict first ban by banner and bannee teams
endog = df.query('order == 0')['hero_id_f']
# BUG FIX: take a .copy() -- assigning a new column into a slice of `df`
# triggers SettingWithCopyWarning and may silently fail to write
exog = df.query('order == 0')[['team', 'team_id_f']].copy()
exog['team_id_a'] = df.loc[endog.index + 1, 'team_id_f'].values  # labels are off by one
exog = exog[(exog >= 0).all(1)]  # drop rows where factorize produced -1 (missing)
good_idx = exog.index
y = endog.loc[good_idx].values
X = exog.loc[good_idx].values
In [397]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
# prior = empirical frequency of each hero in the first-ban slot,
# sorted by factorized hero id so it lines up with clf.classes_
prior = pd.value_counts(y).div(len(y)).sort_index().values
# NOTE: fit_prior is ignored whenever class_prior is supplied, so the
# redundant/misleading fit_prior=True is dropped (behavior unchanged)
clf = MultinomialNB(class_prior=prior)
clf.fit(X, y)
Out[397]:
In [398]:
# in-sample (training) accuracy -- optimistic; see the CV cells below
print("Score {}".format(clf.score(X, y)))
In [399]:
# compare the observed first-ban distribution with the model's predictions
fig, ax = plt.subplots(figsize=(8, 10))
observed = pd.value_counts(y)
predicted = pd.value_counts(clf.predict(X))
counts = pd.concat([observed, predicted], axis=1,
                   keys=['actual', 'predicted']).fillna(0)
counts.plot(kind='barh', ax=ax)
Out[399]:
In [400]:
# Let's try CV
# NOTE(review): sklearn.cross_validation was removed in sklearn 0.20;
# modern code imports these from sklearn.model_selection instead.
from sklearn.cross_validation import train_test_split, KFold
In [401]:
kf = KFold(len(y), n_folds=10)  # 10-fold index generator over the first-ban samples
In [402]:
# 10-fold CV of the MultinomialNB first-ban model
scores = []
for train_idx, test_idx in kf:
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    # don't recompute prior (I think)
    clf = MultinomialNB(fit_prior=True, class_prior=prior)
    # BUG FIX: fit on the training fold only; the original fit on all of
    # (X, y), leaking the test fold and inflating every CV score
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))
print(np.mean(scores))
In [408]:
from sklearn.lda import LDA
# NOTE(review): sklearn.lda was removed; modern sklearn provides
# sklearn.discriminant_analysis.LinearDiscriminantAnalysis.
scores = []
prior = prior / prior.sum()  # LDA requires priors summing to 1
for train_idx, test_idx in kf:
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    # don't recompute prior (I think)
    clf = LDA(len(np.unique(y)), priors=prior)
    # BUG FIX: fit on the training fold only (was clf.fit(X, y), which
    # leaks the test fold into training)
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))
print(np.mean(scores))
The predictors are now the picking/banning team, the opposing team, and the previous picks/bans, and the response is the hero_id.
The priors should be recomputed each time, putting zero weight on heroes already picked / banned.
In [370]:
# TODO the classifier will care (but shouldn't) about the order.
# I don't care if NP was banned first or second, just that it was
# banned. I spose I could map {pb up till now} -> Z.
# how many permutations are there? maybe just do the ones seen.......
def make_frame(data, order=0):
    """
    Construct the endog and exog arrays for predicting the pick/ban
    made at draft position ``order``.

    Parameters
    ----------
    data : DataFrame
        Full picks/bans frame, one row per pick/ban, with a
        ``match_id`` column.
    order : int
        item to pick for (0-based draft position)

    Returns
    -------
    DataFrame
        Indexed by match_id. Columns: ``hero_id_f`` (the response --
        hero taken at ``order``), ``team_id``, ``opp_id``, then the
        previous picks/bans (``pb_*``).
    """
    def wrapper(df, order):
        """
        apply the 3 helper functions to one match's draft.
        Combine into a (1 x d) Series / DataFrame
        """
        # BUG FIX: the response is the hero taken at `order`, not always
        # the first pick/ban (the filter was hardcoded to order == 0)
        hero_id = df[df.order == order]['hero_id_f'].values[0]
        match_id = df['match_id'].iloc[0]
        team_id = h.pb_team_id(df, order=order)
        opp_id = h.pb_opponent_id(df, order=order)
        previous_pbs = h.pb_previous_pbs(df, order=order)
        previous_pbs.index = [match_id]
        temp = pd.DataFrame({'hero_id_f': hero_id, 'team_id': team_id,
                             'opp_id': opp_id}, index=[match_id])
        res = pd.concat([temp, previous_pbs], axis=1)
        return res

    # BUG FIX: group the `data` argument -- the original grouped the
    # module-level `df`, silently ignoring the parameter (hidden state)
    g = data.groupby('match_id', as_index=False)
    res = g.apply(wrapper, order=order)
    # ensure column order: response first, then team features, then pbs.
    # Index.difference replaces the deprecated `cols - sub` arithmetic;
    # both return the set difference sorted.
    cols = res.columns
    sub = ['hero_id_f', 'team_id', 'opp_id']
    res = res[sub + cols.difference(sub).tolist()]
    return res
In [10]:
df19 = make_frame(df, order=19)
In [349]:
pool = df.hero_id_f.unique()  # every factorized hero id seen in the data
def available_heros(df, pool):
    """
    Find the heroes in ``pool`` that have not been seen already.

    Parameters
    ----------
    df : DataFrame
        index is match_id. matches previous pbs on cols starting
        with ``pb_``
    pool : array-like
        all candidate (factorized) hero ids

    Returns
    -------
    unseen : DataFrame
        index is the match_ids from ``df``; each row holds the hero
        ids not yet taken in that match.
    """
    pb_cols = [c for c in df.columns if c.startswith('pb_')]
    taken = df[pb_cols]
    remaining = [np.setdiff1d(pool, row.values)
                 for _, row in taken.iterrows()]
    return pd.DataFrame(remaining, index=taken.index)
def unavailable_heros(df):
    """
    Find the heroes that have been seen already.

    Parameters
    ----------
    df : DataFrame
        index is match_id. matches previous pbs on cols starting
        with ``pb_``

    Returns
    -------
    seen : dict
        {match_id : [seen hero_id_f]}
    """
    cols = [x for x in df.columns if x.startswith('pb_')]
    sub = df[cols]
    # BUG FIX: `outtype=` was renamed `orient=` and later removed from
    # DataFrame.to_dict; passing the orientation positionally works on
    # both old and modern pandas.
    return sub.T.to_dict('list')
def hero_priors(full_df, feature_df):
    """
    Per-match class priors over the factorized hero ids: start from the
    overall pick/ban frequencies, zero out heroes already taken in each
    match, then renormalize each row.
    """
    hero_pool = full_df.hero_id_f.unique()  # should be sorted anyway
    hero_pool.sort()
    # build a [len(feature_df) x len(hero_pool)] array of baseline priors
    freqs = pd.value_counts(full_df.hero_id_f, normalize=True,
                            sort=False).sort_index()
    tiled = np.tile(freqs, (len(feature_df), 1))
    baseline = pd.DataFrame(tiled, index=feature_df.index,
                            columns=hero_pool)
    seen = unavailable_heros(feature_df)
    updated = baseline.copy()
    # set already-seen heroes to zero now; renormalize afterwards so
    # each row remains a probability distribution
    for match_id, row in baseline.iterrows():
        taken = row.index[row.index.isin(seen[match_id])]
        updated.loc[match_id, taken] = 0
    updated = updated.div(updated.sum(1), axis=0)
    return updated
In [350]:
priors = hero_priors(df, df19)  # per-match priors with already-taken heroes zeroed
In [351]:
priors
Out[351]:
In [459]:
df5 = make_frame(df, order=5)  # features/response for the draft item at position 5
priors = hero_priors(df, df5)
In [490]:
# response: hero taken at order 5; features: the two team ids only
y = df5.hero_id_f.values.ravel()
X = df5[['team_id', 'opp_id']].values#, 'pb_0', 'pb_1', 'pb_2', 'pb_3', 'pb_4']].values
In [455]:
### hrmf
# NOTE(review): collapsing the per-match priors into one global vector
# discards the per-match zeroing -- presumably a stopgap because LDA
# accepts only a single priors vector per fit. TODO confirm intent.
priors = priors.sum(0).div(priors.sum(0).sum())
priors = priors[priors.index.isin(np.unique(y))]  # keep only classes present in y
priors = priors / priors.sum()  # renormalize after dropping absent classes
In [469]:
# meh
#
clf = LDA(priors=priors.values)
clf.fit(X, y)
clf.score(X, y)  # in-sample accuracy -- optimistic
Out[469]:
In [ ]:
# NOTE(review): this cell duplicates the earlier MultinomialNB CV loop;
# consider deleting one or wrapping in a function.
scores = []
for train_idx, test_idx in kf:
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    # don't recompute prior (I think)
    clf = MultinomialNB(fit_prior=True, class_prior=prior)
    # BUG FIX: fit on the training fold only; fitting on all of (X, y)
    # leaks the test fold and inflates the CV score
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))
print(np.mean(scores))
In [494]:
from sklearn import tree
# 5-fold CV with a default (fully grown) decision tree
scores = []
for train_idx, test_idx in KFold(len(y), n_folds=5):
    # NOTE(review): X_train/y_train/X_test/y_test from the *last* fold
    # leak out of this loop and are reused by the export cell below --
    # fragile hidden state under Restart & Run All
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))
scores
Out[494]:
In [ ]:
In [505]:
# shallow tree so the exported graph is readable
clf = tree.DecisionTreeClassifier(max_depth=3)
# NOTE(review): X_train/y_train/X_test/y_test are leftovers from the
# final CV fold of the previous cell -- TODO fit on an explicit split
clf.fit(X_train, y_train)
scores.append(clf.score(X_test, y_test))
with open("tree.dot", 'w') as f:
    # FIX: dropped the pointless rebinding `f = tree.export_graphviz(...)`
    # -- the return value shadowed the open file handle for no effect
    tree.export_graphviz(clf, out_file=f)
In [506]:
!dot -Tjpg tree.dot -o tree.jpg
In [509]:
from IPython.display import display, display_jpeg, Image
Image('tree.jpg')
Out[509]:
In [500]:
# NOTE(review): sklearn.grid_search was removed in sklearn 0.20;
# modern code uses sklearn.model_selection.GridSearchCV.
from sklearn.grid_search import GridSearchCV
In [502]:
# (removed) leftover tab-completion stub `tree.` -- a SyntaxError that
# breaks Restart & Run All
Out[502]:
In [ ]:
# TODO(unfinished): wire up the search, e.g.
#   GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv=5).fit(X, y)
param_grid = {'max_depth': [1, 3, 5, 7, 100]}
GridSearchCV